from clustergrammer_widget import *
# Registry of named DataFrames; later steps store each intermediate matrix
# under a string key such as 'ge-ini', 'ge', or 'ge-z'.
df = {}
# Shared Network object that the rest of the script uses to load, normalize,
# cluster, and display matrices. `Network` and `clustergrammer_widget` are
# presumably supplied by the star import from clustergrammer_widget.
net = Network(clustergrammer_widget)
import clustergrammer_groupby as cby
import gene_exp_10x
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw expression matrix. Rows are treated as genes and columns as
# barcodes by the variable naming used throughout this script.
df['ge-ini'] = gene_exp_10x.load_gene_exp_to_df('../data/pbmc3k_filtered_gene_bc_matrices/hg19/')
df['ge-ini'].shape

# Drop every row whose name contains 'RPL' or 'RPS' (presumably ribosomal
# protein genes) and report the row count before and after filtering.
all_genes = df['ge-ini'].index.tolist()
print(len(all_genes))
keep_genes = [gene for gene in all_genes
              if not ('RPL' in gene or 'RPS' in gene)]
print(len(keep_genes))
df['ge'] = df['ge-ini'].loc[keep_genes]
df['ge'].shape
# Removing Mitochondrial Genes
# A row is treated as mitochondrial when its name starts with 'MT-' or when
# the part of its name before the first underscore is one of these symbols.
list_mito_genes = ['MTRNR2L11', 'MTRF1', 'MTRNR2L12', 'MTRNR2L13', 'MTRF1L', 'MTRNR2L6', 'MTRNR2L7',
                   'MTRNR2L10', 'MTRNR2L8', 'MTRNR2L5', 'MTRNR2L1', 'MTRNR2L3', 'MTRNR2L4']
all_genes = df['ge'].index.tolist()

# Sets give constant-time membership tests; the list versions were rescanned
# once per gene.
_mito_symbols = frozenset(list_mito_genes)
mito_genes = [x for x in all_genes
              if x.startswith('MT-') or x.split('_')[0] in _mito_symbols]
print(mito_genes)

_mito_rows = frozenset(mito_genes)
keep_genes = [x for x in all_genes if x not in _mito_rows]

# `.ix` was deprecated in pandas 0.20 and removed in 1.0. `.loc` performs the
# same label-based row selection and matches the ribosomal filter step.
df['ge'] = df['ge'].loc[keep_genes]
# normalize by UMI count
# Column totals are kept in `barcode_umi_sum` because a later step reuses them
# to label each column with its UMI count.
barcode_umi_sum = df['ge'].sum(axis=0)
# Divide every column by its own total (the Series is aligned on columns).
df['ge'] = df['ge'] / barcode_umi_sum

# Z-score each row of the normalized matrix through the Network object,
# replace NaNs with zeros, and keep the result as df['ge-z'].
net.load_df(df['ge'])
net.normalize(axis='row', norm_type='zscore')
net.swap_nan_for_zero()
df['ge-z'] = net.export_df()
df['ge-z'].shape
# Build an interactive heatmap preview from the UMI-normalized matrix.
# The calls below act on the shared `net` object one after another, so their
# order matters.
net.load_df(df['ge'])
# Keep 250 rows ranked by 'var' (presumably the most variable genes).
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
# Z-score each row of the remaining matrix.
net.normalize(axis='row', norm_type='zscore')
# Sample 250 columns; the fixed random_state presumably makes the subset
# repeatable between runs.
net.random_sample(axis='col', num_samples=250, random_state=99)
# Limit values to the range [-5, 5].
net.clip(lower=-5, upper=5)
# Cluster the matrix and render the widget.
net.cluster()
net.widget()
# Load the broad and then the narrow cell type signature files through the
# Network object, store each exported DataFrame in `df`, and print its shape.
for _sig_key, _sig_file in (
    ('bct-sig', '../data/cell_type_signatures/nm3337_broad_cell_type_sigs.txt'),
    ('nct-sig', '../data/cell_type_signatures/nm3337_narrow_cell_type_sigs.txt'),
):
    net.load_file(_sig_file)
    df[_sig_key] = net.export_df()
    print(df[_sig_key].shape)
# Compare total versus distinct row names once everything from the first
# underscore onward is removed, first for the broad signatures ...
sig_rows = df['bct-sig'].index.tolist()
clean_sig_rows = [name.partition('_')[0] for name in sig_rows]
print(len(clean_sig_rows), len(set(clean_sig_rows)))

# ... and then for the expression matrix.
ge_rows = df['ge'].index.tolist()
clean_ge_rows = [name.partition('_')[0] for name in ge_rows]
print(len(ge_rows), len(set(clean_ge_rows)))

# Collect the cleaned gene names that occur more than once.
ser_ge_rows = pd.Series(clean_ge_rows)
gene_name_count = ser_ge_rows.value_counts(ascending=False)
duplicate_genes = gene_name_count.loc[gene_name_count.gt(1)].index.tolist()
len(duplicate_genes)
# Make the cleaned gene names unique: every name listed in `duplicate_genes`
# gets a 1-based occurrence suffix ('GENE_1', 'GENE_2', ...); names that occur
# only once are kept unchanged.
dup_index = {}  # running occurrence count per duplicated gene name
new_rows = []
# Set lookup avoids rescanning the duplicate list for every gene.
_duplicate_lookup = set(duplicate_genes)
for inst_row in clean_ge_rows:
    if inst_row in _duplicate_lookup:
        # add index to non-unique genes
        dup_index[inst_row] = dup_index.get(inst_row, 0) + 1
        new_row = inst_row + '_' + str(dup_index[inst_row])
    else:
        new_row = inst_row
    new_rows.append(new_row)

# The two printed counts match when every generated name is unique.
print(len(new_rows))
print(len(set(new_rows)))

# Relabel the z-scored matrix. This relies on df['ge-z'] having its rows in
# the same order as df['ge'], from which `clean_ge_rows` was derived.
df['ge-z'].index = new_rows
# Reduce the row names of both signature matrices to the text before the
# first underscore, narrow signatures first and broad signatures second.
for _sig_key in ('nct-sig', 'bct-sig'):
    rows = df[_sig_key].index.tolist()
    new_rows = [name.partition('_')[0] for name in rows]
    df[_sig_key].index = new_rows
df['nct-sig'].columns.tolist()
# NOTE: the same stripping is deliberately not applied to df['ge-z'] here;
# the equivalent step for that matrix is disabled in this script.
# Predict a 'Cell Type' category for the z-scored expression data from the
# broad cell type signatures. The call returns three values, kept here as
# df['pred_cat'], df['sig_sim'], and y_info; their exact contents are defined
# by clustergrammer_groupby. unknown_thresh=0.05 is presumably the cutoff
# below which a column is left unassigned -- confirm against that module.
df['pred_cat'], df['sig_sim'], y_info = cby.predict_cats_from_sigs(df['ge-z'], df['bct-sig'],
predict_level='Cell Type', unknown_thresh=0.05)
# Visualize the matrix that carries the predicted categories. The calls below
# act on the shared `net` object in order.
net.load_df(df['pred_cat'])
# Show the 'Cell Type: T cells CD8' column category in red.
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
# Sample 250 columns with a fixed random_state, then limit values to [-5, 5].
net.random_sample(axis='col', num_samples=250, random_state=99)
net.clip(lower=-5, upper=5)
# Cluster the matrix and render the widget.
net.cluster()
net.widget()
# Work on an independent copy of the UMI-normalized matrix.
df['ge-cat'] = df['ge'].copy(deep=True)
df['ge-cat'].shape

# transfer predicted categories to full dataset and add UMI count
# The column labels of df['pred_cat'] are reused as-is, which relies on both
# frames listing their columns in the same order.
cat_cols = df['pred_cat'].columns.tolist()
df['ge-cat'].columns = cat_cols

# Append a third label element with the UMI total, looked up in
# `barcode_umi_sum` by the first element of each column label.
new_cols = []
for col in cat_cols:
    umi_label = 'UMI: ' + str(barcode_umi_sum[col[0]])
    new_cols.append((col[0], col[1], umi_label))
df['ge-cat-umi'] = df['ge-cat'].copy(deep=True)
df['ge-cat-umi'].columns = new_cols
print(df['ge-cat-umi'].shape)

# Visualize the annotated matrix; these calls act on the shared `net` object
# and must run in this order.
net.load_df(df['ge-cat-umi'])
net.set_cat_color(axis='col', cat_index=1, cat_name='Cell Type: T cells CD8', inst_color='red')
net.filter_N_top(inst_rc='row', N_top=250, rank_type='var')
net.random_sample(axis='col', num_samples=250, random_state=99)
net.normalize(axis='row', norm_type='zscore')
net.clip(lower=-5, upper=5)
net.cluster()
net.widget()